library(knitr)
Dataset_bA <- read.csv("C:/Users/Mideh/Downloads/Dataset_bA.csv")
# checking the structure and the first 7 rows of the dataset and renaming the dataset
head(Dataset_bA,7)
structure(Dataset_bA)
D_A <- Dataset_bA
# checking to see for blanks in the entire dataset and in the Cuisines column
sum(D_A == "", na.rm = TRUE)  
## [1] 9
sum(D_A$Cuisines == "")  
## [1] 9

There were 9 blanks in the dataset which were found in the Cuisines column

#cleaning the dataset by removing the NA
D_A[D_A == ""] <- NA               
D_A_clean <- na.omit(D_A)
# loading necesaary libraries
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)

Task 1: TOP CUISINES AND THEIR PERCENTAGES

#Split the 'Cuisines' column to handle multiple cuisines per restaurant
D_A_cuisines <- D_A_clean %>%
  separate_rows(Cuisines, sep = ",") %>%
  mutate(Cuisines = trimws(Cuisines))  # Remove extra spaces
print(D_A_cuisines)
## # A tibble: 19,710 × 21
##    Restaurant.ID Restaurant.Name        Country.Code City       Address Locality
##            <int> <chr>                         <int> <chr>      <chr>   <chr>   
##  1       6317637 Le Petit Souffle                162 Makati Ci… Third … Century…
##  2       6317637 Le Petit Souffle                162 Makati Ci… Third … Century…
##  3       6317637 Le Petit Souffle                162 Makati Ci… Third … Century…
##  4       6304287 Izakaya Kikufuji                162 Makati Ci… Little… Little …
##  5       6300002 Heat - Edsa Shangri-La          162 Mandaluyo… Edsa S… Edsa Sh…
##  6       6300002 Heat - Edsa Shangri-La          162 Mandaluyo… Edsa S… Edsa Sh…
##  7       6300002 Heat - Edsa Shangri-La          162 Mandaluyo… Edsa S… Edsa Sh…
##  8       6300002 Heat - Edsa Shangri-La          162 Mandaluyo… Edsa S… Edsa Sh…
##  9       6318506 Ooma                            162 Mandaluyo… Third … SM Mega…
## 10       6318506 Ooma                            162 Mandaluyo… Third … SM Mega…
## # ℹ 19,700 more rows
## # ℹ 15 more variables: Locality.Verbose <chr>, Longitude <dbl>, Latitude <dbl>,
## #   Cuisines <chr>, Average.Cost.for.two <int>, Currency <chr>,
## #   Has.Table.booking <chr>, Has.Online.delivery <chr>,
## #   Is.delivering.now <chr>, Switch.to.order.menu <chr>, Price.range <int>,
## #   Aggregate.rating <dbl>, Rating.color <chr>, Rating.text <chr>, Votes <int>
# Count occurrences of each cuisine
top_cuisines <- D_A_cuisines %>%
  group_by(Cuisines) %>% 
  summarise(restaurant_count = n_distinct(Restaurant.ID)) %>%  # Count distinct Restaurant_ID
  arrange(desc(restaurant_count)) %>%
  top_n(3, restaurant_count)

# Calculate percentage of restaurants serving each top cuisine
total_restaurants <- nrow(D_A_clean)
top_cuisines <- top_cuisines %>%
  mutate(percentage = (restaurant_count / total_restaurants) * 100)

# View results
print(top_cuisines)
## # A tibble: 3 × 3
##   Cuisines     restaurant_count percentage
##   <chr>                   <int>      <dbl>
## 1 North Indian             3960       41.5
## 2 Chinese                  2733       28.6
## 3 Fast Food                1986       20.8

Task 2: CITY ANALYSIS

# Step 1: Identify the City with the Highest Number of Restaurants

city_restaurant_count <- D_A_cuisines %>%
  group_by(City) %>%
  summarise(restaurant_count = n_distinct(Restaurant.ID)) %>%  # Count distinct Restaurant_ID
  arrange(desc(restaurant_count))

# Display the city with the highest number of restaurants
city_with_most_restaurants <- city_restaurant_count[1, ]
print(city_with_most_restaurants)
## # A tibble: 1 × 2
##   City      restaurant_count
##   <chr>                <int>
## 1 New Delhi             5473
# Step 2: Calculate the Average Rating for Restaurants in Each City

city_avg_rating <- D_A_cuisines %>%
  group_by(City) %>%
  summarise(average_rating = mean(Aggregate.rating, na.rm = TRUE)) %>%
  arrange(desc(average_rating))

# Display the average rating for each city
print(city_avg_rating)
## # A tibble: 140 × 2
##    City             average_rating
##    <chr>                     <dbl>
##  1 Inner City                 4.9 
##  2 Quezon City                4.8 
##  3 Makati City                4.72
##  4 Mandaluyong City           4.6 
##  5 Beechworth                 4.6 
##  6 Pasig City                 4.53
##  7 London                     4.53
##  8 Taguig City                4.53
##  9 Lincoln                    4.5 
## 10 Secunderabad               4.5 
## # ℹ 130 more rows
# Step 3: Determine the City with the Highest Average Rating

city_with_highest_avg_rating <- city_avg_rating[1, ]
print(city_with_highest_avg_rating)
## # A tibble: 1 × 2
##   City       average_rating
##   <chr>               <dbl>
## 1 Inner City            4.9

Task 3: RANGE DISTRIBUTION

library(ggplot2)
# Step 1: Create a Bar Chart of Price Range Distribution
# Assuming the 'Price range' column is already present in your dataset

ggplot(D_A_cuisines, aes(x = Price.range)) +
  geom_bar(fill = "yellow") +
  labs(title = "Price Range Distribution", x = "Price Range", y = "Number of Restaurants") +
  theme_minimal()

# Step 2: Calculate the Percentage of Restaurants in Each Price Range Category
price_range_distribution <- D_A_cuisines %>%
  group_by(Price.range) %>%
  summarise(count = n()) %>%
  mutate(percentage = (count / sum(count)) * 100)

# Print the price range distribution with percentages
print(price_range_distribution)
## # A tibble: 4 × 3
##   Price.range count percentage
##         <int> <int>      <dbl>
## 1           1  7428      37.7 
## 2           2  7133      36.2 
## 3           3  3758      19.1 
## 4           4  1391       7.06

Task 4: ONLINE DELIVERY

# Step 1: Determine the Percentage of Restaurants that Offer Online Delivery

# Calculate the percentage of restaurants that offer online delivery
online_delivery_percentage <- D_A_cuisines %>%
  group_by(Has.Online.delivery) %>%
  summarise(count = n()) %>%
  mutate(percentage = (count / sum(count)) * 100)

# Print the percentage of restaurants that offer online delivery
print(online_delivery_percentage)
## # A tibble: 2 × 3
##   Has.Online.delivery count percentage
##   <chr>               <int>      <dbl>
## 1 No                  13909       70.6
## 2 Yes                  5801       29.4
# Step 2: Compare the Average Ratings of Restaurants With and Without Online Delivery

# Calculate the average rating for restaurants with and without online delivery
avg_ratings_online_delivery <- D_A_cuisines %>%
  group_by(Has.Online.delivery) %>%
  summarise(average_rating = mean(Aggregate.rating, na.rm = TRUE))

# Print the comparison of average ratings
print(avg_ratings_online_delivery)
## # A tibble: 2 × 2
##   Has.Online.delivery average_rating
##   <chr>                        <dbl>
## 1 No                            2.68
## 2 Yes                           3.31
# Step 1 Analyze the Distribution of Aggregate Ratings

# Create a histogram of the aggregate ratings
ggplot(D_A_cuisines, aes(x = Aggregate.rating)) +
  geom_histogram(binwidth = 0.5, fill = "skyblue", color = "black") +
  labs(title = "Distribution of Aggregate Ratings", x = "Aggregate Rating", y = "Count of Restaurants") +
  theme_minimal()

# Step 1.2: Identify the most common rating range
rating_range <- D_A_cuisines %>%
  group_by(Aggregate.rating)%>%
  summarise(count = n()) %>%
  arrange(desc(count)) 
most_common_rating_range <- rating_range[1,]
print(most_common_rating_range)
## # A tibble: 1 × 2
##   Aggregate.rating count
##              <dbl> <int>
## 1                0  3443
# Step 2: Calculate the Average Number of Votes Received by Restaurants

# Calculate the average number of votes
Average_votes <- D_A_cuisines %>%
  summarise(average_votes = mean(Votes, na.rm = TRUE))


# Step 1: Identify the Most Common Combinations of Cuisines
# Count how many times each cuisine combination appears
Common_cuisines_combination <- D_A_cuisines %>%
  group_by(Cuisines)%>%
  summarise(count = n())%>%
  arrange(desc(count)) 
print(Common_cuisines_combination)
## # A tibble: 145 × 2
##    Cuisines     count
##    <chr>        <int>
##  1 North Indian  3960
##  2 Chinese       2735
##  3 Fast Food     1986
##  4 Mughlai        995
##  5 Italian        764
##  6 Bakery         745
##  7 Continental    736
##  8 Cafe           703
##  9 Desserts       653
## 10 South Indian   636
## # ℹ 135 more rows
library(sf)
## Linking to GEOS 3.12.1, GDAL 3.8.4, PROJ 9.3.1; sf_use_s2() is TRUE
# Step 2: Determine if Certain Cuisine Combinations Tend to Have Higher Ratings
# Calculate the average aggregate rating for each cuisine combination
cuisines_combinations_ratings <- D_A_cuisines %>%
  group_by(Cuisines)%>%
  summarise(Avg_rating = mean(Aggregate.rating, count = n()))%>%
  arrange(desc(Avg_rating))

# location of restaurants on a map
# Convert to sf object using longitude and latitude
restaurants_sf <- st_as_sf(D_A_cuisines, coords = c("Longitude", "Latitude"), crs = 4326)

# Create the plot
ggplot() +
  geom_sf(data = restaurants_sf) +
  theme_minimal() +
  labs(title = "Restaurant Locations", x = "Longitude", y = "Latitude")

# Step 1: Identify restaurant chains (restaurants with multiple locations)
restaurant_chains <- D_A_cuisines %>%
  group_by(Restaurant.Name) %>%
  summarise(
    location_count = n_distinct(Restaurant.ID),  # Count distinct Restaurant IDs
    avg_rating = mean(Aggregate.rating),  # Calculate average rating
    total_votes = sum(Votes)  # Sum of votes (popularity)
  ) %>%
  filter(location_count > 1)  # Keep only restaurant chains (more than 1 location)


# Step 2: View the result
print(restaurant_chains)
## # A tibble: 734 × 4
##    Restaurant.Name           location_count avg_rating total_votes
##    <chr>                              <int>      <dbl>       <int>
##  1 10 Downing Street                      2       4           1340
##  2 221 B Baker Street                     3       3.37         215
##  3 34 Parkstreet Lane                     2       3.05          31
##  4 34, Chowringhee Lane                  12       2.39         777
##  5 4700BC Popcorn                         2       3.5          176
##  6 6 Pack Momos                           2       1.4            8
##  7 A Piece of Paris                       2       3.75         162
##  8 AB's - Absolute Barbecues              4       4.82       40200
##  9 AB's Absolute Barbecues                2       4.85        6302
## 10 Aap Ki Khatir                          2       0              0
## # ℹ 724 more rows
# You can sort by average rating or total votes to find the most popular chains
top_rated_chains <- restaurant_chains %>%
  arrange(desc(avg_rating))  # Sort by average rating

print(top_rated_chains)
## # A tibble: 734 × 4
##    Restaurant.Name           location_count avg_rating total_votes
##    <chr>                              <int>      <dbl>       <int>
##  1 Talaga Sampireun                       3       4.9        11028
##  2 AB's Absolute Barbecues                2       4.85        6302
##  3 Silantro Fil-Mex                       2       4.85        2728
##  4 AB's - Absolute Barbecues              4       4.82       40200
##  5 Naturals Ice Cream                     2       4.8         3094
##  6 Gymkhana                               2       4.7          756
##  7 The Cheesecake Factory                 2       4.65        6020
##  8 Dishoom                                2       4.61        4771
##  9 Chili's                                5       4.6        30215
## 10 Garota de Ipanema                      2       4.6          118
## # ℹ 724 more rows
most_popular_chains <- restaurant_chains %>%
  arrange(desc(total_votes))  # Sort by total votes

print(most_popular_chains)
## # A tibble: 734 × 4
##    Restaurant.Name           location_count avg_rating total_votes
##    <chr>                              <int>      <dbl>       <int>
##  1 Barbeque Nation                       26       4.33       58631
##  2 Big Chill                              4       4.47       43412
##  3 AB's - Absolute Barbecues              4       4.82       40200
##  4 Tea Villa Cafe                         4       3.92       31002
##  5 Chili's                                5       4.6        30215
##  6 Truffles                               2       4.32       29016
##  7 Haldiram's                            16       3.63       28445
##  8 Pirates of Grill                       4       4.03       27342
##  9 Subway                                63       2.91       24496
## 10 Out Of The Box                         2       3.89       23209
## # ℹ 724 more rows